# import Libraries
import pandas as pd # For data Analysis
import numpy as np # working with arrays
from sklearn.preprocessing import LabelEncoder #to normalize labels
import matplotlib.pyplot as plt #for visualization
import seaborn as sns # for visualizing
import plotly.express as px
from google.colab import drive
# Mount Google Drive so the dataset stored there is reachable from Colab.
drive.mount("/content/drive")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Load the movie-regression dataset (506 rows x 18 columns).
# NOTE(review): the file has an .xls extension but is read with read_csv —
# presumably it is actually CSV text; a true Excel file would need pd.read_excel.
data=pd.read_csv("/content/drive/MyDrive/DTR dataset/Movie_regression.xls")
data
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | 3D_available | Time_taken | Twitter_hastags | Genre | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20.1264 | 59.62 | 0.462 | 36524.125 | 138.7 | 7.825 | 8.095 | 7.910 | 7.995 | 7.94 | 527367 | YES | 109.60 | 223.840 | Thriller | 23 | 494 | 48000 |
| 1 | 20.5462 | 69.14 | 0.531 | 35668.655 | 152.4 | 7.505 | 7.650 | 7.440 | 7.470 | 7.44 | 494055 | NO | 146.64 | 243.456 | Drama | 42 | 462 | 43200 |
| 2 | 20.5458 | 69.14 | 0.531 | 39912.675 | 134.6 | 7.485 | 7.570 | 7.495 | 7.515 | 7.44 | 547051 | NO | 147.88 | 2022.400 | Comedy | 38 | 458 | 69400 |
| 3 | 20.6474 | 59.36 | 0.542 | 38873.890 | 119.3 | 6.895 | 7.035 | 6.920 | 7.020 | 8.26 | 516279 | YES | 185.36 | 225.344 | Drama | 45 | 472 | 66800 |
| 4 | 21.3810 | 59.36 | 0.542 | 39701.585 | 127.7 | 6.920 | 7.070 | 6.815 | 7.070 | 8.26 | 531448 | NO | 176.48 | 225.792 | Drama | 55 | 395 | 72400 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 21.2526 | 78.86 | 0.427 | 36624.115 | 142.6 | 8.680 | 8.775 | 8.620 | 8.970 | 6.80 | 492480 | NO | 186.96 | 243.584 | Action | 27 | 561 | 44800 |
| 502 | 20.9054 | 78.86 | 0.427 | 33996.600 | 150.2 | 8.780 | 8.945 | 8.770 | 8.930 | 7.80 | 482875 | YES | 132.24 | 263.296 | Action | 20 | 600 | 41200 |
| 503 | 21.2152 | 78.86 | 0.427 | 38751.680 | 164.5 | 8.830 | 8.970 | 8.855 | 9.010 | 7.80 | 532239 | NO | 109.56 | 243.824 | Comedy | 31 | 576 | 47800 |
| 504 | 22.1918 | 78.86 | 0.427 | 37740.670 | 162.8 | 8.730 | 8.845 | 8.800 | 8.845 | 6.80 | 496077 | YES | 158.80 | 303.520 | Comedy | 47 | 607 | 44000 |
| 505 | 20.9482 | 78.86 | 0.427 | 33496.650 | 154.3 | 8.640 | 8.880 | 8.680 | 8.790 | 6.80 | 518438 | YES | 205.60 | 203.040 | Comedy | 45 | 604 | 38000 |
506 rows × 18 columns
# First 10 rows for a quick look at the raw values.
data.head(10)
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | 3D_available | Time_taken | Twitter_hastags | Genre | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20.1264 | 59.62 | 0.462 | 36524.125 | 138.7 | 7.825 | 8.095 | 7.910 | 7.995 | 7.94 | 527367 | YES | 109.60 | 223.840 | Thriller | 23 | 494 | 48000 |
| 1 | 20.5462 | 69.14 | 0.531 | 35668.655 | 152.4 | 7.505 | 7.650 | 7.440 | 7.470 | 7.44 | 494055 | NO | 146.64 | 243.456 | Drama | 42 | 462 | 43200 |
| 2 | 20.5458 | 69.14 | 0.531 | 39912.675 | 134.6 | 7.485 | 7.570 | 7.495 | 7.515 | 7.44 | 547051 | NO | 147.88 | 2022.400 | Comedy | 38 | 458 | 69400 |
| 3 | 20.6474 | 59.36 | 0.542 | 38873.890 | 119.3 | 6.895 | 7.035 | 6.920 | 7.020 | 8.26 | 516279 | YES | 185.36 | 225.344 | Drama | 45 | 472 | 66800 |
| 4 | 21.3810 | 59.36 | 0.542 | 39701.585 | 127.7 | 6.920 | 7.070 | 6.815 | 7.070 | 8.26 | 531448 | NO | 176.48 | 225.792 | Drama | 55 | 395 | 72400 |
| 5 | 20.5970 | 59.36 | 0.542 | 35718.650 | 132.2 | 6.890 | 7.100 | 6.885 | 7.005 | 7.26 | 498425 | YES | 143.48 | 284.592 | Comedy | 53 | 460 | 57400 |
| 6 | 21.7658 | 70.74 | 0.476 | 33396.660 | 140.1 | 7.065 | 7.265 | 7.150 | 7.400 | 8.96 | 459241 | YES | 139.16 | 243.664 | Thriller | 41 | 522 | 45800 |
| 7 | 22.8910 | 70.74 | 0.476 | 34285.460 | 169.6 | 6.980 | 7.075 | 6.875 | 7.170 | 7.96 | 400821 | NO | 116.84 | 243.536 | Drama | 56 | 571 | 44200 |
| 8 | 24.2248 | 70.74 | 0.476 | 31280.205 | 173.5 | 6.910 | 7.075 | 6.850 | 7.000 | 7.96 | 295168 | YES | 118.60 | 242.640 | Comedy | 55 | 564 | 33000 |
| 9 | 23.4008 | 70.74 | 0.476 | 33352.220 | 159.4 | 6.665 | 6.725 | 6.575 | 6.855 | 7.96 | 412012 | YES | 189.56 | 283.024 | Thriller | 45 | 508 | 37800 |
# Last 3 rows.
data.tail(3)
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | 3D_available | Time_taken | Twitter_hastags | Genre | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 503 | 21.2152 | 78.86 | 0.427 | 38751.68 | 164.5 | 8.83 | 8.970 | 8.855 | 9.010 | 7.8 | 532239 | NO | 109.56 | 243.824 | Comedy | 31 | 576 | 47800 |
| 504 | 22.1918 | 78.86 | 0.427 | 37740.67 | 162.8 | 8.73 | 8.845 | 8.800 | 8.845 | 6.8 | 496077 | YES | 158.80 | 303.520 | Comedy | 47 | 607 | 44000 |
| 505 | 20.9482 | 78.86 | 0.427 | 33496.65 | 154.3 | 8.64 | 8.880 | 8.680 | 8.790 | 6.8 | 518438 | YES | 205.60 | 203.040 | Comedy | 45 | 604 | 38000 |
# (rows, columns) of the DataFrame.
data.shape
(506, 18)
# Total number of cells (506 rows * 18 columns = 9108).
data.size
9108
# Column dtypes: mostly float64/int64, plus two object columns
# (3D_available and Genre).
data.dtypes
Marketing expense float64 Production expense float64 Multiplex coverage float64 Budget float64 Movie_length float64 Lead_ Actor_Rating float64 Lead_Actress_rating float64 Director_rating float64 Producer_rating float64 Critic_rating float64 Trailer_views int64 3D_available object Time_taken float64 Twitter_hastags float64 Genre object Avg_age_actors int64 Num_multiplex int64 Collection int64 dtype: object
# Show only the categorical (object-dtype) columns.
data.select_dtypes("object")
| 3D_available | Genre | |
|---|---|---|
| 0 | YES | Thriller |
| 1 | NO | Drama |
| 2 | NO | Comedy |
| 3 | YES | Drama |
| 4 | NO | Drama |
| ... | ... | ... |
| 501 | NO | Action |
| 502 | YES | Action |
| 503 | NO | Comedy |
| 504 | YES | Comedy |
| 505 | YES | Comedy |
506 rows × 2 columns
# Column names (note irregular spelling, e.g. 'Lead_ Actor_Rating', 'Twitter_hastags').
data.columns
Index(['Marketing expense', 'Production expense', 'Multiplex coverage',
'Budget', 'Movie_length', 'Lead_ Actor_Rating', 'Lead_Actress_rating',
'Director_rating', 'Producer_rating', 'Critic_rating', 'Trailer_views',
'3D_available', 'Time_taken', 'Twitter_hastags', 'Genre',
'Avg_age_actors', 'Num_multiplex', 'Collection'],
dtype='object')
# Non-null counts and dtypes; Time_taken has only 494 non-null values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Marketing expense 506 non-null float64 1 Production expense 506 non-null float64 2 Multiplex coverage 506 non-null float64 3 Budget 506 non-null float64 4 Movie_length 506 non-null float64 5 Lead_ Actor_Rating 506 non-null float64 6 Lead_Actress_rating 506 non-null float64 7 Director_rating 506 non-null float64 8 Producer_rating 506 non-null float64 9 Critic_rating 506 non-null float64 10 Trailer_views 506 non-null int64 11 3D_available 506 non-null object 12 Time_taken 494 non-null float64 13 Twitter_hastags 506 non-null float64 14 Genre 506 non-null object 15 Avg_age_actors 506 non-null int64 16 Num_multiplex 506 non-null int64 17 Collection 506 non-null int64 dtypes: float64(12), int64(4), object(2) memory usage: 71.3+ KB
# Summary statistics of the numeric columns (mean/std/quartiles/min/max).
data.describe()
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | Time_taken | Twitter_hastags | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 494.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
| mean | 92.270471 | 77.273557 | 0.445305 | 34911.144022 | 142.074901 | 8.014002 | 8.185613 | 8.019664 | 8.190514 | 7.810870 | 449860.715415 | 157.391498 | 260.832095 | 39.181818 | 545.043478 | 45057.707510 |
| std | 172.030902 | 13.720706 | 0.115878 | 3903.038232 | 28.148861 | 1.054266 | 1.054290 | 1.059899 | 1.049601 | 0.659699 | 68917.763145 | 31.295161 | 104.779133 | 12.513697 | 106.332889 | 18364.351764 |
| min | 20.126400 | 55.920000 | 0.129000 | 19781.355000 | 76.400000 | 3.840000 | 4.035000 | 3.840000 | 4.030000 | 6.600000 | 212912.000000 | 0.000000 | 201.152000 | 3.000000 | 333.000000 | 10000.000000 |
| 25% | 21.640900 | 65.380000 | 0.376000 | 32693.952500 | 118.525000 | 7.316250 | 7.503750 | 7.296250 | 7.507500 | 7.200000 | 409128.000000 | 132.300000 | 223.796000 | 28.000000 | 465.000000 | 34050.000000 |
| 50% | 25.130200 | 74.380000 | 0.462000 | 34488.217500 | 151.000000 | 8.307500 | 8.495000 | 8.312500 | 8.465000 | 7.960000 | 462460.000000 | 160.000000 | 254.400000 | 39.000000 | 535.500000 | 42400.000000 |
| 75% | 93.541650 | 91.200000 | 0.551000 | 36793.542500 | 167.575000 | 8.865000 | 9.030000 | 8.883750 | 9.030000 | 8.260000 | 500247.500000 | 181.890000 | 283.416000 | 50.000000 | 614.750000 | 50000.000000 |
| max | 1799.524000 | 110.480000 | 0.615000 | 48772.900000 | 173.500000 | 9.435000 | 9.540000 | 9.425000 | 9.635000 | 9.400000 | 567784.000000 | 217.520000 | 2022.400000 | 60.000000 | 868.000000 | 100000.000000 |
# Count of fully duplicated rows (0 here, so nothing to drop).
data.duplicated().sum()
0
# Missing values per column: only Time_taken has any (12).
data.isna().sum()
Marketing expense 0 Production expense 0 Multiplex coverage 0 Budget 0 Movie_length 0 Lead_ Actor_Rating 0 Lead_Actress_rating 0 Director_rating 0 Producer_rating 0 Critic_rating 0 Trailer_views 0 3D_available 0 Time_taken 12 Twitter_hastags 0 Genre 0 Avg_age_actors 0 Num_multiplex 0 Collection 0 dtype: int64
# Pairwise Pearson correlation of the numeric columns.  Selecting numeric
# dtypes explicitly avoids the nuisance-column deprecation warning that newer
# pandas raises when object columns are silently dropped (see the warning
# emitted by the skew() call below).
data.select_dtypes(include=np.number).corr()
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | Time_taken | Twitter_hastags | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Marketing expense | 1.000000 | 0.406583 | -0.420972 | -0.219247 | 0.352734 | 0.380050 | 0.379813 | 0.380069 | 0.376462 | -0.184985 | -0.443457 | 0.026019 | 0.013518 | 0.059204 | 0.383298 | -0.389582 |
| Production expense | 0.406583 | 1.000000 | -0.763651 | -0.391676 | 0.644779 | 0.706481 | 0.707956 | 0.707566 | 0.705819 | -0.251565 | -0.591657 | 0.015888 | -0.000839 | 0.055810 | 0.707559 | -0.484754 |
| Multiplex coverage | -0.420972 | -0.763651 | 1.000000 | 0.302188 | -0.731470 | -0.768589 | -0.769724 | -0.769157 | -0.764873 | 0.145555 | 0.581386 | 0.035922 | 0.004882 | -0.092104 | -0.915495 | 0.429300 |
| Budget | -0.219247 | -0.391676 | 0.302188 | 1.000000 | -0.240265 | -0.208464 | -0.203981 | -0.201907 | -0.205397 | 0.232361 | 0.602536 | 0.040773 | 0.030674 | -0.064694 | -0.282796 | 0.696304 |
| Movie_length | 0.352734 | 0.644779 | -0.731470 | -0.240265 | 1.000000 | 0.746904 | 0.746493 | 0.747021 | 0.746707 | -0.217830 | -0.589318 | -0.019984 | 0.009380 | 0.075198 | 0.673896 | -0.377999 |
| Lead_ Actor_Rating | 0.380050 | 0.706481 | -0.768589 | -0.208464 | 0.746904 | 1.000000 | 0.997905 | 0.997735 | 0.994073 | -0.169978 | -0.490267 | 0.038494 | 0.014463 | 0.036794 | 0.706331 | -0.251355 |
| Lead_Actress_rating | 0.379813 | 0.707956 | -0.769724 | -0.203981 | 0.746493 | 0.997905 | 1.000000 | 0.998097 | 0.994003 | -0.165992 | -0.487536 | 0.038432 | 0.010239 | 0.038005 | 0.708257 | -0.249459 |
| Director_rating | 0.380069 | 0.707566 | -0.769157 | -0.201907 | 0.747021 | 0.997735 | 0.998097 | 1.000000 | 0.994126 | -0.166638 | -0.486452 | 0.036300 | 0.010077 | 0.041470 | 0.709364 | -0.246650 |
| Producer_rating | 0.376462 | 0.705819 | -0.764873 | -0.205397 | 0.746707 | 0.994073 | 0.994003 | 0.994126 | 1.000000 | -0.167003 | -0.487911 | 0.028988 | 0.005850 | 0.032542 | 0.703518 | -0.248200 |
| Critic_rating | -0.184985 | -0.251565 | 0.145555 | 0.232361 | -0.217830 | -0.169978 | -0.165992 | -0.166638 | -0.167003 | 1.000000 | 0.228641 | -0.015033 | -0.023655 | -0.049797 | -0.128769 | 0.341288 |
| Trailer_views | -0.443457 | -0.591657 | 0.581386 | 0.602536 | -0.589318 | -0.490267 | -0.487536 | -0.486452 | -0.487911 | 0.228641 | 1.000000 | 0.075145 | -0.006704 | -0.049726 | -0.544100 | 0.720119 |
| Time_taken | 0.026019 | 0.015888 | 0.035922 | 0.040773 | -0.019984 | 0.038494 | 0.038432 | 0.036300 | 0.028988 | -0.015033 | 0.075145 | 1.000000 | -0.006390 | 0.072849 | -0.057580 | 0.111092 |
| Twitter_hastags | 0.013518 | -0.000839 | 0.004882 | 0.030674 | 0.009380 | 0.014463 | 0.010239 | 0.010077 | 0.005850 | -0.023655 | -0.006704 | -0.006390 | 1.000000 | -0.004840 | 0.006255 | 0.023122 |
| Avg_age_actors | 0.059204 | 0.055810 | -0.092104 | -0.064694 | 0.075198 | 0.036794 | 0.038005 | 0.041470 | 0.032542 | -0.049797 | -0.049726 | 0.072849 | -0.004840 | 1.000000 | 0.078811 | -0.047426 |
| Num_multiplex | 0.383298 | 0.707559 | -0.915495 | -0.282796 | 0.673896 | 0.706331 | 0.708257 | 0.709364 | 0.703518 | -0.128769 | -0.544100 | -0.057580 | 0.006255 | 0.078811 | 1.000000 | -0.391729 |
| Collection | -0.389582 | -0.484754 | 0.429300 | 0.696304 | -0.377999 | -0.251355 | -0.249459 | -0.246650 | -0.248200 | 0.341288 | 0.720119 | 0.111092 | 0.023122 | -0.047426 | -0.391729 | 1.000000 |
# Skewness of each numeric column; Marketing expense and Twitter_hastags are
# strongly right-skewed.  Restricting to numeric dtypes fixes the
# FutureWarning about dropping nuisance (object) columns.
data.select_dtypes(include=np.number).skew()
<ipython-input-131-b3b431164adb>:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
Marketing expense 5.223149 Production expense 0.295022 Multiplex coverage -0.729308 Budget 0.403612 Movie_length -0.598963 Lead_ Actor_Rating -1.010577 Lead_Actress_rating -1.007492 Director_rating -1.003848 Producer_rating -1.004680 Critic_rating 0.176139 Trailer_views -0.843831 Time_taken -0.473481 Twitter_hastags 13.790552 Avg_age_actors 0.012971 Num_multiplex 0.534221 Collection 1.110912 dtype: float64
# Class balance of the 3D flag (YES 279 / NO 227).
data['3D_available'].value_counts()
YES 279 NO 227 Name: 3D_available, dtype: int64
# Bar chart of the 3D_available counts.
sns.countplot(x = '3D_available',data=data)
plt.show()
# Histogram of Collection split by 3D availability; the dashed vertical line
# marks the overall mean Collection.
fig = px.histogram(data, 'Collection',
                   color="3D_available",
                   title="<b>Average Collection</b>")
fig.add_vline(x=data['Collection'].mean(), line_width=2, line_dash="dash", line_color="black")
fig.show()
# For collections of more than 50k, the majority came from 3D movies.
# Genre distribution (Thriller 183, Comedy 155, Drama 97, Action 71).
data['Genre'].value_counts()
Thriller 183 Comedy 155 Drama 97 Action 71 Name: Genre, dtype: int64
# Bar chart of the Genre counts.
sns.countplot(x = 'Genre',data=data)
plt.show()
# Histogram of Collection split by Genre, mean Collection marked with a
# dashed vertical line.
fig = px.histogram(data, 'Collection',
                   color="Genre",
                   title="<b>Average Collection</b>")
fig.add_vline(x=data['Collection'].mean(), line_width=2, line_dash="dash", line_color="black")
fig.show()
# Distribution (histogram + KDE) of every numeric feature.
# Fix: the original created a single figure and drew all 16 histograms onto
# the same axes before one trailing plt.show(); draw one figure per column
# so each distribution is readable.
x = data.drop(['3D_available','Genre'],axis = 1)
for col in x.columns:
    plt.figure(figsize=(6,8))
    sns.histplot(x[col], kde=True)
    plt.show()
# Scatter of Collection against each numeric feature.
# Fix: the original overlaid every feature onto one axes (all in the same
# colour, wildly different y-scales); give each feature its own figure.
x = data.drop(['3D_available','Genre'],axis = 1)
for col in x.columns:
    plt.figure(figsize=(6,8))
    sns.scatterplot(x = 'Collection', y = col, data = data, color = 'Red')
    plt.show()
# Annotated heatmap of pairwise correlations between the numeric features.
plt.figure(figsize=(16, 9))
x = data.drop(['3D_available', 'Genre'], axis=1)
correlations = x.corr()
ax = sns.heatmap(correlations, annot=True, cmap='viridis')
plt.show()
# Pairwise scatter matrix of every column (slow for 18 columns; overview only).
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7f8d5d57a730>
# One box plot per numeric feature, to eyeball outliers before counting them.
x = data.drop(['3D_available','Genre'],axis = 1)
for i in x.columns:
    sns.boxplot(x = i, data = x,color = 'yellowgreen')
    plt.xlabel(i)
    plt.show()
def count_outliers(data, col):
    """Report IQR outliers in data[col] and return how many there are.

    A value is an outlier when it falls outside Tukey's fences
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Fixes over the original: uses the ``col`` parameter instead of the
    global loop variable ``i``; no longer mutates globals (LLP/ULP/a);
    drops the unused q2/q4 quantiles; returns the count so callers can
    use it programmatically.
    """
    q1 = data[col].quantile(0.25, interpolation='nearest')
    q3 = data[col].quantile(0.75, interpolation='nearest')
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    below = data[data[col] < lower_fence][col].size
    above = data[data[col] > upper_fence][col].size
    n_outliers = below + above
    if n_outliers == 0:
        print("No outliers in", col)
    else:
        print("There are outliers in", col)
        print('Count of outliers are:', n_outliers)
    return n_outliers
# Collect the names of columns flagged as containing outliers.
# (Was a module-level `global a` mutated from inside count_outliers.)
a = []
for i in x.columns:
    if count_outliers(x, i):
        a.append(i)
There are outliers in Marketing expense Count of outliers are: 66 No outliers in Production expense No outliers in Multiplex coverage There are outliers in Budget Count of outliers are: 30 No outliers in Movie_length There are outliers in Lead_ Actor_Rating Count of outliers are: 5 There are outliers in Lead_Actress_rating Count of outliers are: 5 There are outliers in Director_rating Count of outliers are: 5 There are outliers in Producer_rating Count of outliers are: 5 No outliers in Critic_rating There are outliers in Trailer_views Count of outliers are: 10 There are outliers in Time_taken Count of outliers are: 2 There are outliers in Twitter_hastags Count of outliers are: 2 No outliers in Avg_age_actors There are outliers in Num_multiplex Count of outliers are: 3 There are outliers in Collection Count of outliers are: 37
# Re-confirm the missing values before imputation (Time_taken: 12 nulls).
data.isnull().sum()
Marketing expense 0 Production expense 0 Multiplex coverage 0 Budget 0 Movie_length 0 Lead_ Actor_Rating 0 Lead_Actress_rating 0 Director_rating 0 Producer_rating 0 Critic_rating 0 Trailer_views 0 3D_available 0 Time_taken 12 Twitter_hastags 0 Genre 0 Avg_age_actors 0 Num_multiplex 0 Collection 0 dtype: int64
# Time_taken has outliers (see the outlier counts above), so impute missing
# values with the median, which is robust to them.  Assign the result back
# instead of using inplace=True on a column selection, which is a pandas
# chained-assignment pitfall (deprecated in recent pandas versions).
data['Time_taken'] = data['Time_taken'].fillna(data['Time_taken'].median())
# Verify the imputation: no missing values should remain.
data.isnull().sum()
Marketing expense 0 Production expense 0 Multiplex coverage 0 Budget 0 Movie_length 0 Lead_ Actor_Rating 0 Lead_Actress_rating 0 Director_rating 0 Producer_rating 0 Critic_rating 0 Trailer_views 0 3D_available 0 Time_taken 0 Twitter_hastags 0 Genre 0 Avg_age_actors 0 Num_multiplex 0 Collection 0 dtype: int64
# One-hot encode the two categorical columns; drop_first=True keeps k-1
# indicator columns per category to avoid redundant (collinear) dummies.
label_2=pd.get_dummies(data=data,columns=['Genre','3D_available'],drop_first=True)
label_2
| Marketing expense | Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | Time_taken | Twitter_hastags | Avg_age_actors | Num_multiplex | Collection | Genre_Comedy | Genre_Drama | Genre_Thriller | 3D_available_YES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20.1264 | 59.62 | 0.462 | 36524.125 | 138.7 | 7.825 | 8.095 | 7.910 | 7.995 | 7.94 | 527367 | 109.60 | 223.840 | 23 | 494 | 48000 | 0 | 0 | 1 | 1 |
| 1 | 20.5462 | 69.14 | 0.531 | 35668.655 | 152.4 | 7.505 | 7.650 | 7.440 | 7.470 | 7.44 | 494055 | 146.64 | 243.456 | 42 | 462 | 43200 | 0 | 1 | 0 | 0 |
| 2 | 20.5458 | 69.14 | 0.531 | 39912.675 | 134.6 | 7.485 | 7.570 | 7.495 | 7.515 | 7.44 | 547051 | 147.88 | 2022.400 | 38 | 458 | 69400 | 1 | 0 | 0 | 0 |
| 3 | 20.6474 | 59.36 | 0.542 | 38873.890 | 119.3 | 6.895 | 7.035 | 6.920 | 7.020 | 8.26 | 516279 | 185.36 | 225.344 | 45 | 472 | 66800 | 0 | 1 | 0 | 1 |
| 4 | 21.3810 | 59.36 | 0.542 | 39701.585 | 127.7 | 6.920 | 7.070 | 6.815 | 7.070 | 8.26 | 531448 | 176.48 | 225.792 | 55 | 395 | 72400 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 21.2526 | 78.86 | 0.427 | 36624.115 | 142.6 | 8.680 | 8.775 | 8.620 | 8.970 | 6.80 | 492480 | 186.96 | 243.584 | 27 | 561 | 44800 | 0 | 0 | 0 | 0 |
| 502 | 20.9054 | 78.86 | 0.427 | 33996.600 | 150.2 | 8.780 | 8.945 | 8.770 | 8.930 | 7.80 | 482875 | 132.24 | 263.296 | 20 | 600 | 41200 | 0 | 0 | 0 | 1 |
| 503 | 21.2152 | 78.86 | 0.427 | 38751.680 | 164.5 | 8.830 | 8.970 | 8.855 | 9.010 | 7.80 | 532239 | 109.56 | 243.824 | 31 | 576 | 47800 | 1 | 0 | 0 | 0 |
| 504 | 22.1918 | 78.86 | 0.427 | 37740.670 | 162.8 | 8.730 | 8.845 | 8.800 | 8.845 | 6.80 | 496077 | 158.80 | 303.520 | 47 | 607 | 44000 | 1 | 0 | 0 | 1 |
| 505 | 20.9482 | 78.86 | 0.427 | 33496.650 | 154.3 | 8.640 | 8.880 | 8.680 | 8.790 | 6.80 | 518438 | 205.60 | 203.040 | 45 | 604 | 38000 | 1 | 0 | 0 | 1 |
506 rows × 20 columns
# 18 original columns -> 20 after one-hot encoding (3 genre dummies + 1 3D dummy).
label_2.shape
(506, 20)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features / target.
X = label_2.drop(['Collection'],axis = 1)
Y = label_2['Collection']

# Split FIRST, then fit the scaler on the training portion only, so no
# information from the test rows leaks into the scaling statistics.
# (The original fit the scaler on the full X before splitting, and also
# performed the train/test split twice.)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3,random_state=44)

scaler = StandardScaler()
scaler.fit(X_train)  # mean/std come from the training rows only
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
# Keep a scaled copy of the full matrix for the grid search further below,
# transformed with the train-fitted scaler.
X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
# Feature matrix: 506 rows, 19 predictors.
X.shape
(506, 19)
# Target vector: 506 values.
Y.shape
(506,)
# 70% training split.
X_train.shape
(354, 19)
# 30% test split.
X_test.shape
(152, 19)
from sklearn.tree import DecisionTreeRegressor
# Baseline tree with default hyperparameters: it grows until the leaves are
# pure, which memorises the training data (overfitting — tuned later).
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, Y_train)
DecisionTreeRegressor(random_state=0)
# Predict Collection for the held-out test rows.
pred = regressor.predict(X_test)
pred
array([ 40800., 30400., 38000., 55800., 33600., 72200., 46000.,
45200., 48800., 40400., 42800., 41200., 45200., 43200.,
50000., 11200., 35600., 45200., 43000., 33400., 59200.,
50200., 100000., 38000., 38200., 42000., 21000., 40600.,
30800., 61400., 29600., 39800., 31200., 35600., 40800.,
34800., 82600., 72000., 55000., 45600., 30400., 35000.,
55000., 42000., 23400., 31200., 65800., 63000., 34600.,
40800., 63200., 69800., 28200., 17000., 56800., 26600.,
64000., 100000., 26600., 37400., 34400., 19000., 28200.,
33600., 47600., 44600., 44400., 38400., 59200., 45600.,
39200., 38200., 100000., 40800., 34800., 31200., 41800.,
82600., 79600., 38000., 46800., 41800., 40800., 72400.,
30400., 41200., 33400., 46600., 47200., 12600., 47600.,
30400., 41200., 30000., 33000., 45800., 97600., 33600.,
32800., 38600., 48400., 45400., 42000., 19200., 38000.,
43400., 35600., 40000., 32800., 47400., 45600., 39600.,
79600., 72800., 48400., 42400., 72200., 92000., 24600.,
44600., 100000., 28800., 47800., 40800., 48200., 66400.,
33200., 35600., 39200., 42400., 34000., 45200., 28000.,
33600., 37200., 49000., 69400., 46000., 31200., 28800.,
37000., 30400., 33200., 84600., 46600., 52400., 63200.,
36400., 100000., 44000., 32400., 39800.])
# R^2 on the training data — 1.0 here because the unpruned tree memorised it.
regressor.score(X_train,Y_train)
1.0
# R^2 on the unseen test data (~0.76) — the train/test gap shows overfitting.
regressor.score(X_test,Y_test)
0.7641204466003866
We got a 100% score on the training data.
On the test data we got about a 76.5% score because we did not provide any tuning parameters while initializing the tree, so the algorithm split the training data all the way down to the leaf nodes. The depth of the tree grew, and the model overfit.
That's why we get a high score on the training data and a lower score on the test data.
To solve this problem we will use hyperparameter tuning.
We can use GridSearchCV or RandomizedSearchCV for hyperparameter tuning.
# True vs predicted values; a perfect model would lie on the diagonal.
plt.scatter(Y_test,pred)
plt.xlabel('Y Test (True Values)')
plt.ylabel('Predicted values')
plt.show()
from sklearn import metrics
# Error metrics for the untuned tree (RMSE is in the same units as Collection).
print('MAE',metrics.mean_absolute_error(Y_test,pred))
print('MSE',metrics.mean_squared_error(Y_test,pred))
print('RMSE',np.sqrt(metrics.mean_squared_error(Y_test,pred)))
MAE 6409.210526315789 MSE 81275526.31578948 RMSE 9015.29402270328
# Explained variance score.  NOTE(review): the original comment called this
# "r2 score", but explained_variance_score is not R^2 — unlike R^2 it does
# not penalise a systematic bias in the predictions; use metrics.r2_score
# if R^2 is what is wanted.
metrics.explained_variance_score(Y_test,pred)
0.765079993476032
# Residual distribution (true - predicted); roughly bell-shaped residuals
# centred on zero suggest errors in the same range as the data (see the
# caveat in the text below — this alone does not prove the model is good).
sns.displot(Y_test-pred,bins = 50,kde = True)
<seaborn.axisgrid.FacetGrid at 0x7f8d58e51a90>
We are getting a nearly bell-shaped curve — does that mean our model is working well? No, we can't draw that conclusion. A good bell curve only tells us that the predicted values lie within the same range as the original data values.
# Hyperparameter search space for the decision tree.
# NOTE(review): max_features="auto" is deprecated and later removed in newer
# scikit-learn releases — verify against the installed version before rerunning.
parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
            "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
            "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
            "max_features":["auto","log2","sqrt",None],
            "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] }
Above we initialized ranges of hyperparameter values; GridSearchCV will search over them to find the best parameters for our decision tree model.
# Exhaustive grid search over `parameters`, 3-fold cross-validation,
# scored by negative MSE (scikit-learn convention: higher is better).
from sklearn.model_selection import GridSearchCV
tuning_model=GridSearchCV(regressor,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)
# function for measuring how long hyperparameter tuning takes
def timer(start_time=None):
    """Start or stop a simple wall-clock timer.

    Called with no argument: returns the current time (the start marker).
    Called with a previous return value: prints the elapsed time as
    hours : minutes : seconds and returns None.
    """
    # Local import so the helper works even though the notebook only imports
    # datetime in a later cell (the original relied on that later import).
    from datetime import datetime
    if start_time is None:  # explicit None check instead of truthiness
        return datetime.now()
    # Split the elapsed seconds into h / m / s for display.
    thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print(thour, ":", tmin, ':', round(tsec, 2))
%%capture
# %%capture suppresses the very long verbose=3 GridSearchCV log.
from datetime import datetime
start_time=timer(None)
# NOTE(review): the search is fitted on the full X/Y; fitting on
# X_train/Y_train only would keep the test set completely unseen — confirm intent.
tuning_model.fit(X,Y)
timer(start_time)
Hyperparameter tuning took around 13 minutes. It might vary depending upon your machine.
# Best hyperparameter combination found by the grid search.
tuning_model.best_params_
{'max_depth': 3,
'max_features': 'auto',
'max_leaf_nodes': None,
'min_samples_leaf': 1,
'min_weight_fraction_leaf': 0.1,
'splitter': 'best'}
# Best cross-validated score; negative because the scoring metric is
# neg_mean_squared_error (closer to 0 is better).
tuning_model.best_score_
-158157198.0285828
# Rebuild the tree with the best parameters reported by the grid search above.
tuned_hyper_model= DecisionTreeRegressor(max_depth=3,max_features='auto',max_leaf_nodes=None,min_samples_leaf=1,min_weight_fraction_leaf=0.1,splitter='best')
# Fit on the training split only.
tuned_hyper_model.fit(X_train,Y_train)
DecisionTreeRegressor(max_depth=3, max_features='auto',
min_weight_fraction_leaf=0.1)
# Predict with the tuned tree and plot true vs predicted.
tuned_pred=tuned_hyper_model.predict(X_test)
plt.scatter(Y_test,tuned_pred)
<matplotlib.collections.PathCollection at 0x7f8d57788850>
OK, the above scatter plot looks a lot better.
Let us now compare the error rate of our hyperparameter-tuned model against our original model, which was trained without any parameter tuning.
# Error metrics for the tuned tree.
print('MAE:', metrics.mean_absolute_error(Y_test,tuned_pred))
print('MSE:', metrics.mean_squared_error(Y_test, tuned_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(Y_test, tuned_pred)))
MAE: 8037.384198275319 MSE: 125674495.13882908 RMSE: 11210.463645132126
# Untuned-tree metrics again, for side-by-side comparison with the tuned model.
print('MAE',metrics.mean_absolute_error(Y_test,pred))
print('MSE',metrics.mean_squared_error(Y_test,pred))
print('RMSE',np.sqrt(metrics.mean_squared_error(Y_test,pred)))
MAE 6409.210526315789 MSE 81275526.31578948 RMSE 9015.29402270328
# R^2 of the tuned tree on the test data (~0.64).
tuned_hyper_model.score(X_test,Y_test)
0.6352648191795232
# Explained variance of the tuned tree's predictions.
metrics.explained_variance_score(Y_test,tuned_pred)
0.6357758970060626
If you observe the above metrics for both models, the model without hyperparameter tuning achieved better metric values (MSE 81275526.32 and about a 76.5% score) than the hyperparameter-tuned model.
# Side-by-side actual vs predicted values for the untuned tree.
pd.DataFrame({'Actual': Y_test, 'Predicted': pred.round(2)})
| Actual | Predicted | |
|---|---|---|
| 316 | 35600 | 40800.0 |
| 218 | 43000 | 30400.0 |
| 485 | 42400 | 38000.0 |
| 274 | 64800 | 55800.0 |
| 427 | 21800 | 33600.0 |
| ... | ... | ... |
| 254 | 43800 | 36400.0 |
| 232 | 83400 | 100000.0 |
| 289 | 49600 | 44000.0 |
| 15 | 39800 | 32400.0 |
| 365 | 55000 | 39800.0 |
152 rows × 2 columns
# Side-by-side actual vs predicted values for the tuned tree (note the few
# distinct predicted values — a depth-3 tree has few leaves).
pd.DataFrame({'Actual': Y_test, 'Predicted': tuned_pred.round(2)})
| Actual | Predicted | |
|---|---|---|
| 316 | 35600 | 33512.90 |
| 218 | 43000 | 33512.90 |
| 485 | 42400 | 43927.78 |
| 274 | 64800 | 53922.22 |
| 427 | 21800 | 43927.78 |
| ... | ... | ... |
| 254 | 43800 | 43927.78 |
| 232 | 83400 | 72320.69 |
| 289 | 49600 | 53922.22 |
| 15 | 39800 | 43927.78 |
| 365 | 55000 | 43927.78 |
152 rows × 2 columns